rm(list = ls())
setwd("/Users/John/Dropbox/")

# --- Load Required Packages ---
library(readr)
library(tidyverse)
library(readxl)

# --- Load CCD ISIC data ---
ccd_isic <- read_csv("JOP_Replication_Materials/data/processed/ccd_isic_processing.csv")

# --- Load Chinese Import Data from Trademap and reshape ---
cn_imports <- read_excel("JOP_Replication_Materials/appendix/data/cn_imports.xlsx") %>%
  mutate(industry = str_remove(industry, "'")) %>%
  pivot_longer(cols = 2:21, names_to = "year", values_to = "cn_imports")

# --- Load Global Import Data from Trademap and reshape ---
w_imports <- read_excel("JOP_Replication_Materials/appendix/data/world_imports.xlsx") %>%
  mutate(industry = str_remove(industry, "'")) %>%
  pivot_longer(cols = 2:21, names_to = "year", values_to = "world_imports")

# --- Merge Chinese and Global Import Data ---
merge <- left_join(cn_imports, w_imports, by = c("year", "industry"))

# --- Load ISIC-HS4 Correspondence Table ---
cor <- read_csv("JOP_Replication_Materials/data/raw/isic_hs4_correspondence.csv") %>%
  mutate(across(c(isic, hs4), str_pad, width = 4, pad = "0"))

# --- Merge Import Data with Correspondence Table ---
merge2 <- merge %>%
  left_join(cor, by = c("industry" = "hs4"), relationship = "many-to-many") %>%
  filter(!is.na(isic)) %>%
  mutate(year = as.numeric(year))

# --- Aggregate Chinese and Global Imports by ISIC and Year ---
merge2 <- merge2 %>%
  group_by(isic, year) %>%
  mutate(
    cn_imp_isic = sum(na.omit(cn_imports)),
    w_imp_isic = sum(na.omit(world_imports))
  )

# --- Merge Import Data and CCD Data ---
merge3 <- merge2 %>%
  left_join(ccd_isic, by = c("isic", "year")) %>%
  select(year, isic, cn_imp_isic, w_imp_isic, processing_share) %>%
  distinct(isic, year, .keep_all = TRUE)

# --- Create Additional Variables ---
merge3 <- merge3 %>%
  mutate(
    ratio = processing_share / 100,
    ept_imports = ratio * cn_imp_isic,
    non_ept_imports = cn_imp_isic - ept_imports,
    nonept_imp_worldshare = non_ept_imports / w_imp_isic,
    total_cnimp_worldshare = cn_imp_isic / w_imp_isic
  )

# --- Load Full Dataset and Merge with Market Size Data ---
df <- read_csv("JOP_Replication_Materials/data/processed/final_dataset.csv") %>%
  left_join(merge3, by = c("isic", "year"))

# --- Create Additional Lead Variables ---
df <- df %>%
  group_by(isic) %>%
  mutate(
    nonept_share_lead4 = lead(nonept_imp_worldshare, 4),
    nonept_share_lead8 = lead(nonept_imp_worldshare, 8),
    total_cn_world_share4 = lead(total_cnimp_worldshare, 4),
    total_cn_world_share8 = lead(total_cnimp_worldshare, 8)
  ) %>%
  ungroup()

# --- Select Relevant Columns ---
df <- df %>%
  select(isic, year, strategic, isic_year, sqrtta, median_share, 
         med_hhi_isic2, med_soe_isic2, nonept_imp_worldshare, 
         nonept_share_lead4, nonept_share_lead8, total_cnimp_worldshare,
         total_cn_world_share4, total_cn_world_share8)

# --- Arrange the Data by ISIC and Year ---
df <- arrange(df, isic, year)

# --- Save to CSV ---
write.csv(df, "JOP_Replication_Materials/appendix/data/final_dataset_marketsize.csv")
